{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "import datetime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"地点\"\"\"\n",
    "where = [\"地点21_mean\",\"地点213_count\",\"地点72_sum\",\"地点283_sum\",\"地点1733_count\",\"地点118_sum\",\"地点277_sum\",\"地点232_count\",\n",
    "        \"地点2008_min\",\"地点69_count\",\"地点465_min\",\"地点232_mean\",\"地点179_sum\",\"地点1113_sum\",\"地点1155_count\",\"地点879_count\",\n",
    "        \"地点972_sum\",\"地点72_count\",\"地点86_count\",\"地点1760_min\",\"地点241_median\",\"地点282_mean\",\"地点270_sum\",\"地点118_max\",\n",
    "        \"地点585_sum\",\"地点65_max\",\"地点263_mean\",\"地点1708_max\",\"地点956_mean\",\"地点664_max\",\"地点550_mean\",\"地点915_mean\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"counsume\"\"\"\n",
    "consume = [\"换卡_count\",\"POS消费_mean\",\"卡充值_mean\",\"POS消费_sum\",\n",
    "        \"POS消费_count\",\"圈存转账_sum\",\"POS消费_max\",\"圈存转账_mean\",\"POS消费_min\",\"卡充值_sum\",\"圈存转账_count\",\"卡补办_sum\",\n",
    "        \"卡充值_count\",\"支付领取_min\",\"POS消费_median\",\"卡片销户_min\",\"卡补办_count\",\"支付领取_sum\",\"支付领取_count\",\"圈存转账_min\",\n",
    "        \"卡充值_min\",\"更新卡信息_max\",\"卡挂失_count\",\"卡补办_median\",\"换卡_mean\",\"换卡_sum\",\"卡补办_max\",\"换卡_min\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"time\"\"\"\n",
    "time = [\"2015-05-21_mean\",\"2014-09-05_mean\",\"2014-09-01_mean\",\"2014-10-27_mean\",\"2015-07-19_median\",\"2013-09-02_sum\",\n",
    "        \"2014-12-10_min\",\"2014-09-26_mean\",\"2015-03-11_sum\",\"2015-02-05_min\",\"2015-07-23_count\",\n",
    "       \"2014-12-24_count\",\"2013-09-23_sum\",\"2015-06-29_min\",\"2014-09-08_sum\",\"2014-04-03_min\",\n",
    "       \"2014-11-21_sum\",\"2015-06-04_mean\",\"2014-08-07_max\",\"2014-09-24_mean\",\"2014-10-07_mean\",\n",
    "       \"2014-09-07_sum\",\"2015-05-22_median\",\"2015-01-04_min\",\"2015-05-23_sum\",\"2015-07-20_mean\",\n",
    "       \"2015-05-30_mean\",\"2014-09-22_mean\",\"2014-12-26_median\",\"2014-07-01_max\",\"2014-10-17_min\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "\"\"\"HourofDay\"\"\"\n",
    "time_HourofDay = [\"HourofDay17_count\",\"HourofDay12_count\",\"HourofDay9_count\",\"HourofDay16_count\",\"HourofDay11_count\",\"HourofDay11_mean\",\n",
    "                   \"HourofDay23_min\",\"HourofDay18_mean\",\"HourofDay15_sum\",\"HourofDay7_sum\",\"HourofDay8_median\",\"HourofDay17_mean\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"MonthofYeah\"\"\"\n",
    "time_MonthofYeah = [\"MonthofYeah8_count\",\"MonthofYeah3_min\",\"MonthofYeah1_count\",\"MonthofYeah9_sum\",\"MonthofYeah5_min\",\"MonthofYeah7_mean\",\n",
    "                   \"MonthofYeah8_median\",\"MonthofYeah4_min\",\"MonthofYeah9_min\",\"MonthofYeah1_median\",\"MonthofYeah7_count\",\"MonthofYeah4_mean\",\n",
    "                   \"MonthofYeah10_count\",\"MonthofYeah6_min\",\"MonthofYeah12_count\",\"MonthofYeah8_sum\",\"MonthofYeah7_sum\",\"MonthofYeah8_mean\",\n",
    "                   \"MonthofYeah11_mean\",\"MonthofYeah2_count\",\"MonthofYeah9_max\",\"MonthofYeah10_min\",\"MonthofYeah9_median\",\"MonthofYeah7_max\",\n",
    "                   \"MonthofYeah3_sum\",\"MonthofYeah1_sum\",\"MonthofYeah11_min\",\"MonthofYeah12_median\",\"MonthofYeah5_median\",\"MonthofYeah12_sum\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"DayofYeah\"\"\"\n",
    "time_DayofYeah =[\"DayofYeah250_sum\",\"DayofYeah251_sum\",\"DayofYeah63_sum\",\"DayofYeah248_sum\",\"DayofYeah289_median\",\"DayofYeah193_min\",\n",
    "                   \"DayofYeah295_max\",\"DayofYeah282_max\",\"DayofYeah187_min\",\"DayofYeah9_min\",\"DayofYeah142_median\",\"DayofYeah129_sum\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"DayofWeek\"\"\"\n",
    "time_DayofWeek = [\"DayofWeek5_count\",\"DayofWeek2_min\",\"DayofWeek0_mean\",\"DayofWeek3_min\",\"DayofWeek1_sum\",\"DayofWeek2_mean\",\"DayofWeek1_count\",\n",
    "                 \"DayofWeek0_min\",\"DayofWeek5_median\",\"DayofWeek1_median\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"DayofMonth\"\"\"\n",
    "time_DayofMonth =[\"DayofMonth2_sum\",\"DayofMonth9_sum\",\"DayofMonth16_min\",\"DayofMonth19_median\",\"DayofMonth9_min\",\"DayofMonth18_max\",\n",
    "                   \"DayofMonth8_sum\",\"DayofMonth17_mean\",\"DayofMonth2_median\",\"DayofMonth1_min\",\"DayofMonth14_min\",\"DayofMonth4_sum\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"time_MonthofYeah_how\"\"\"\n",
    "time_MonthofYeah_how =[\"time_MonthofYeah_how_mean_26\",\"time_MonthofYeah_how_sum_26\",\"time_MonthofYeah_how_max_17\",\"time_MonthofYeah_how_max_26\",\n",
    "                       \"time_MonthofYeah_how_count_67\",\"time_MonthofYeah_how_sum_40\",\n",
    "                   \"time_MonthofYeah_how_mean_34\",\"time_MonthofYeah_how_mean_22\",\"time_MonthofYeah_how_min_44\",\"time_MonthofYeah_how_count_23\",\n",
    "                       \"time_MonthofYeah_how_max_8\",\"time_MonthofYeah_how_max_34\",\"time_MonthofYeah_how_min_19\",\"time_MonthofYeah_how_mean_65\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"feature_card_time_MonthofYeah_consume\"\"\"\n",
    "time_MonthofYeah_consume = [\"time_MonthofYeah_consume_count_145\",\"time_MonthofYeah_consume_sum_8\",\"time_MonthofYeah_consume_count_25\",\n",
    "                            \"time_MonthofYeah_consume_median_0\",\"time_MonthofYeah_consume_count_14\",\"time_MonthofYeah_consume_min_17\",\n",
    "                   \"time_MonthofYeah_consume_sum_2\",\"time_MonthofYeah_consume_count_8\",\"time_MonthofYeah_consume_median_14\",\"time_MonthofYeah_consume_min_4\",\n",
    "                       \"time_MonthofYeah_consume_mean_0\",\"time_MonthofYeah_consume_mean_12\",\"time_MonthofYeah_consume_sum_23\",\n",
    "                            \"time_MonthofYeah_consume_count_0\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"feature_card_time_HourofDay_how\"\"\"\n",
    "time_HourofDay_how = [\"time_HourofDay_how_count_44\",\"time_HourofDay_how_mean_13\",\"time_HourofDay_how_median_39\",\n",
    "                            \"time_HourofDay_how_count_7\",\"time_HourofDay_how_count_15\",\"time_HourofDay_how_max_6\",\n",
    "                   \"time_HourofDay_how_sum_6\",\"time_HourofDay_how_max_11\",\"time_HourofDay_how_count_42\",\"time_HourofDay_how_count_61\",\n",
    "                       \"time_HourofDay_how_max_109\",\"time_HourofDay_how_min_18\",\"time_HourofDay_how_sum_38\",\n",
    "                            \"time_HourofDay_how_max_44\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"feature_card_time_HourofDay_consume\"\"\"\n",
    "time_HourofDay_consume =  [\"time_HourofDay_consume_count_18\",\"time_HourofDay_consume_count_6\",\"time_HourofDay_consume_mean_18\",\n",
    "                            \"time_HourofDay_consume_count_10\",\"time_HourofDay_consume_sum_4\",\"time_HourofDay_consume_mean_8\",\n",
    "                   \"time_HourofDay_consume_count_7\",\"time_HourofDay_consume_count_16\",\"time_HourofDay_consume_max_18\",\"time_HourofDay_consume_max_10\",\n",
    "                       \"time_HourofDay_consume_mean_5\",\"time_HourofDay_consume_min_25\",\"time_HourofDay_consume_min_1\",\n",
    "                            \"time_HourofDay_consume_sum_23\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"feature_card_time_DayofWeek_how\"\"\"\n",
    "time_DayofWeek_how =  [\"time_DayofWeek_how_max_5\",\"time_DayofWeek_how_mean_10\",\"time_DayofWeek_how_mean_5\",\n",
    "                            \"time_DayofWeek_how_min_23\",\"time_DayofWeek_how_sum_39\",\"time_DayofWeek_how_min_18\",\n",
    "                   \"time_DayofWeek_how_count_32\",\"time_DayofWeek_how_sum_36\",\"time_DayofWeek_how_mean_21\",\"time_DayofWeek_how_max_21\",\n",
    "                       \"time_DayofWeek_how_count_45\",\"time_DayofWeek_how_max_33\",\"time_DayofWeek_how_min_24\",\n",
    "                            \"time_DayofWeek_how_sum_22\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"feature_card_time_DayofWeek_consume\"\"\"\n",
    "time_DayofWeek_consume = [\"time_DayofWeek_consume_count_10\",\"time_DayofWeek_consume_max_12\",\"time_DayofWeek_consume_mean_6\",\n",
    "                            \"time_DayofWeek_consume_sum_2\",\"time_DayofWeek_consume_sum_0\",\"time_DayofWeek_consume_sum_52\",\n",
    "                   \"time_DayofWeek_consume_median_2\",\"time_DayofWeek_consume_min_4\",\"time_DayofWeek_consume_sum_10\",\"time_DayofWeek_consume_sum_6\",\n",
    "                       \"time_DayofWeek_consume_max_6\",\"time_DayofWeek_consume_min_48\",\"time_DayofWeek_consume_min_10\",\n",
    "                            \"time_DayofWeek_consume_sum_11\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "score_train = pd.read_table('data/train/score_train.txt',sep=',',header=-1)\n",
    "score_train.columns = ['id','college','score']\n",
    "score_test = pd.read_table('data/test/score_final_test.txt',sep=',',header=-1)\n",
    "score_test.columns = ['id','college','score']\n",
    "score_train_test = pd.concat([score_train,score_test])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "college_train_test = score_train_test.groupby([\"id\",\"college\"])[\"score\"].max().unstack([\"college\"]).fillna(0)\n",
    "college_name_list = list(college_train_test.columns)\n",
    "college_train_test = college_train_test.rename(columns=dict(zip(college_name_list,[\"college_\"+str(i) for i in college_name_list]))).reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "college = pd.DataFrame(score_train_test.groupby(['college'])['score'].max()).reset_index()\n",
    "college.columns = ['college','college_num']\n",
    "score_train_test = pd.merge(score_train_test, college, how='left',on='college')\n",
    "score_train_test['college_orderate'] = score_train_test['score']/score_train_test['college_num']\n",
    "del score_train_test[\"college\"]\n",
    "#del score_train_test[\"score\"]\n",
    "score_train_test = pd.merge(score_train_test,college_train_test,how=\"left\",on=\"id\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "card_train = pd.read_table('data/train/card_train.txt',sep=',',header=-1)\n",
    "card_train.columns = ['id','consume','where','how','time','amount','remainder']\n",
    "card_test = pd.read_table('data/test/card_final_test.txt',sep=',',header=-1)\n",
    "card_test.columns = ['id','consume','where','how','time','amount','remainder']\n",
    "card_train_test = pd.concat([card_train,card_test])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def Time_to_num(das):\n",
    "    tmp = pd.DatetimeIndex(das)\n",
    "    daop = pd.DataFrame(dict(zip(*[[\"{}_{}\".format(das.name, i) for i in [\"MonthofYeah\",\"DayofYeah\", \"DayofMonth\", \"DayofWeek\", \"HourofDay\"]], \n",
    "                                   [ tmp.month,tmp.dayofyear, tmp.day, tmp.dayofweek,tmp.hour]])),\n",
    "                        index = das.index)\n",
    "    return(daop)\n",
    "\n",
    "card_train_1 = pd.read_table('data/train/card_train.txt',sep=',', parse_dates = [4],header=-1)\n",
    "card_train_1.columns = ['id','consume','where','how','time','amount','remainder']\n",
    "card_test_1 = pd.read_table('data/test/card_final_test.txt',sep=',', parse_dates = [4],header=-1)\n",
    "card_test_1.columns = ['id','consume','where','how','time','amount','remainder']\n",
    "card_train_test_1 = pd.concat([card_train_1,card_test_1])\n",
    "card_train_test = pd.concat([card_train_test,Time_to_num(card_train_test_1[\"time\"])],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "card = pd.DataFrame(card_train_test.groupby(['id'])[\"time\"].count()).reset_index().rename(columns={\"time\":\"cardActionNum\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"剩余金额上，“和”这个概念是不应该的，因此次数越多和就越大，所以一个人的和越大，不一定是体现出消费能力，可能只是比别人多几次消费\"\"\"\n",
    "\"\"\"用剩余金额的均值、众数、最小值和最大值还是有一定的理论意义的\"\"\"\n",
    "\"\"\"并且剩余金额这个特征是不需要依附其他特征而存在的，本身就可以比较\"\"\"\n",
    "#card['remaindersum'] = card_train_test.groupby(['id'])['remainder'].sum()\n",
    "card['remainderavg'] = card_train_test.groupby(['id'])['remainder'].mean()\n",
    "card['remaindermax'] = card_train_test.groupby(['id'])['remainder'].max()\n",
    "card['remaindermin'] = card_train_test.groupby(['id'])['remainder'].min()\n",
    "card['remaindermedian'] = card_train_test.groupby(['id'])['remainder'].median()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "card_train_test['time'] = card_train_test['time'].apply(lambda x: datetime.datetime.strptime(\n",
    "        x, \"%Y/%m/%d %H:%M:%S\").date())\n",
    "time_year = pd.DataFrame(card_train_test.groupby(['id'])[\"time\"].max()).reset_index()\n",
    "time_year['time_date'] = time_year['time'].apply(lambda x: 0 if x < datetime.date(2014, 9, 1) else 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def singleFeatureDeal(str1,feature):\n",
    "    x_count = card_train_test.groupby([\"id\",str1])[feature].count().unstack([str1])\n",
    "    x_count = x_count.rename(columns=dict(zip(x_count,[str(i)+\"_count\" for i in x_count])))\n",
    "\n",
    "    x_sum = card_train_test.groupby([\"id\",str1])[feature].sum().unstack([str1])\n",
    "    x_sum = x_sum.rename(columns=dict(zip(x_sum,[str(i)+\"_sum\" for i in x_sum])))\n",
    "\n",
    "    x_mean = card_train_test.groupby([\"id\",str1])[feature].mean().unstack([str1])\n",
    "    x_mean = x_mean.rename(columns=dict(zip(x_mean,[str(i)+\"_mean\" for i in x_mean])))\n",
    "\n",
    "    x_max = card_train_test.groupby([\"id\",str1])[feature].max().unstack([str1])\n",
    "    x_max = x_max.rename(columns=dict(zip(x_max,[str(i)+\"_max\" for i in x_max])))\n",
    "\n",
    "    x_min = card_train_test.groupby([\"id\",str1])[feature].min().unstack([str1])\n",
    "    x_min = x_min.rename(columns=dict(zip(x_min,[str(i)+\"_min\" for i in x_min])))\n",
    "\n",
    "    x_median = card_train_test.groupby([\"id\",str1])[feature].median().unstack([str1])\n",
    "    x_median = x_median.rename(columns=dict(zip(x_median,[str(i)+\"_median\" for i in x_median])))\n",
    "    \n",
    "    x = pd.concat([x_count,x_sum,x_mean,x_max,x_min,x_median],axis=1).reset_index().fillna(0)\n",
    "    return x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def singleFeatureDeal2(str1,feature):\n",
    "    x_count = card_train_test.groupby([\"id\",str1])[feature].count().unstack([str1])\n",
    "    x_count = x_count.rename(columns=dict(zip(x_count,[str1.strip(\"time_\")+str(i)+\"_count\" for i in x_count])))\n",
    "\n",
    "    x_sum = card_train_test.groupby([\"id\",str1])[feature].sum().unstack([str1])\n",
    "    x_sum = x_sum.rename(columns=dict(zip(x_sum,[str1.strip(\"time_\")+str(i)+\"_sum\" for i in x_sum])))\n",
    "\n",
    "    x_mean = card_train_test.groupby([\"id\",str1])[feature].mean().unstack([str1])\n",
    "    x_mean = x_mean.rename(columns=dict(zip(x_mean,[str1.strip(\"time_\")+str(i)+\"_mean\" for i in x_mean])))\n",
    "\n",
    "    x_max = card_train_test.groupby([\"id\",str1])[feature].max().unstack([str1])\n",
    "    x_max = x_max.rename(columns=dict(zip(x_max,[str1.strip(\"time_\")+str(i)+\"_max\" for i in x_max])))\n",
    "\n",
    "    x_min = card_train_test.groupby([\"id\",str1])[feature].min().unstack([str1])\n",
    "    x_min = x_min.rename(columns=dict(zip(x_min,[str1.strip(\"time_\")+str(i)+\"_min\" for i in x_min])))\n",
    "\n",
    "    x_median = card_train_test.groupby([\"id\",str1])[feature].median().unstack([str1])\n",
    "    x_median = x_median.rename(columns=dict(zip(x_median,[str1.strip(\"time_\")+str(i)+\"_median\" for i in x_median])))\n",
    "    \n",
    "    x = pd.concat([x_count,x_sum,x_mean,x_max,x_min,x_median],axis=1).reset_index().fillna(0)\n",
    "    return x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def doubleFeatureDeal(str1,str2,feature):\n",
    "    xy_count = card_train_test.groupby([\"id\",str1,str2])[feature].count().unstack([str1,str2])\n",
    "    xy_count = pd.DataFrame(xy_count.values,index=xy_count.index).rename(columns=dict(zip(range(xy_count.shape[1]),\n",
    "                                                                                          [str1+\"_\"+str2+\"_count_\"+str(i) for i in range(xy_count.shape[1])])))\n",
    "    xy_sum = card_train_test.groupby([\"id\",str1,str2])[feature].sum().unstack([str1,str2])\n",
    "    xy_sum = pd.DataFrame(xy_sum.values,index=xy_sum.index).rename(columns=dict(zip(range(xy_sum.shape[1]),\n",
    "                                                                                          [str1+\"_\"+str2+\"_sum_\"+str(i) for i in range(xy_sum.shape[1])])))\n",
    "    xy_mean = card_train_test.groupby([\"id\",str1,str2])[feature].mean().unstack([str1,str2])\n",
    "    xy_mean = pd.DataFrame(xy_mean.values,index=xy_mean.index).rename(columns=dict(zip(range(xy_mean.shape[1]),\n",
    "                                                                                          [str1+\"_\"+str2+\"_mean_\"+str(i) for i in range(xy_mean.shape[1])])))\n",
    "    xy_max = card_train_test.groupby([\"id\",str1,str2])[feature].max().unstack([str1,str2])\n",
    "    xy_max = pd.DataFrame(xy_max.values,index=xy_max.index).rename(columns=dict(zip(range(xy_max.shape[1]),\n",
    "                                                                                          [str1+\"_\"+str2+\"_max_\"+str(i) for i in range(xy_max.shape[1])])))\n",
    "    xy_min = card_train_test.groupby([\"id\",str1,str2])[feature].min().unstack([str1,str2])\n",
    "    xy_min = pd.DataFrame(xy_min.values,index=xy_min.index).rename(columns=dict(zip(range(xy_min.shape[1]),\n",
    "                                                                                          [str1+\"_\"+str2+\"_min_\"+str(i) for i in range(xy_min.shape[1])])))\n",
    "    xy_median = card_train_test.groupby([\"id\",str1,str2])[feature].median().unstack([str1,str2])\n",
    "    xy_median = pd.DataFrame(xy_median.values,index=xy_median.index).rename(columns=dict(zip(range(xy_median.shape[1]),\n",
    "                                                                                          [str1+\"_\"+str2+\"_median_\"+str(i) for i in range(xy_median.shape[1])])))\n",
    "    \n",
    "    xy = pd.concat([xy_count,xy_sum,xy_mean,xy_max,xy_min,xy_median],axis=1).reset_index().fillna(0)\n",
    "    return xy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "card_how_amount = singleFeatureDeal(\"how\",\"amount\")\n",
    "card_consume_amount = singleFeatureDeal(\"consume\",\"amount\")\n",
    "card_where_amount = singleFeatureDeal(\"where\",\"amount\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "card_where_amount.loc[:,[\"id\"]+where].to_csv(\"card_where_amount_top.csv\",encoding=\"GB18030\",index= False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "card_consume_amount.loc[:,[\"id\"]+consume].to_csv(\"card_consume_amount_top.csv\",encoding=\"GB18030\",index= False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "card_time_amount = singleFeatureDeal(\"time\",\"amount\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "card_time_amount.loc[:,[\"id\"]+time].to_csv(\"card_time_amount_top.csv\",encoding=\"GB18030\",index= False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "card_time_amount = singleFeatureDeal2(\"time_MonthofYeah\",\"amount\")\n",
    "card_time_amount.loc[:,[\"id\"]+time_MonthofYeah].to_csv(\"card_time_MonthofYeah_amount_top.csv\",encoding=\"GB18030\",index= False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "card_time_amount = singleFeatureDeal2(\"time_DayofYeah\",\"amount\")\n",
    "card_time_amount.loc[:,[\"id\"]+time_DayofYeah].to_csv(\"card_time_time_DayofYeah_amount_top.csv\",encoding=\"GB18030\",index= False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "card_time_amount = singleFeatureDeal2(\"time_DayofMonth\",\"amount\")\n",
    "card_time_amount.loc[:,[\"id\"]+time_DayofMonth].to_csv(\"card_time_DayofMonth_amount_top.csv\",encoding=\"GB18030\",index= False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "card_time_amount = singleFeatureDeal2(\"time_DayofWeek\",\"amount\")\n",
    "card_time_amount.loc[:,[\"id\"]+time_DayofWeek].to_csv(\"card_time_DayofWeek_amount_top.csv\",encoding=\"GB18030\",index= False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "card_time_amount = singleFeatureDeal2(\"time_HourofDay\",\"amount\")\n",
    "card_time_amount.loc[:,[\"id\"]+time_HourofDay].to_csv(\"card_time_HourofDay_amount_top.csv\",encoding=\"GB18030\",index= False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# train_test\n",
    "train = pd.read_table('../data/train/subsidy_train.txt',sep=',',header=-1)\n",
    "train.columns = ['id','money']\n",
    "test = pd.read_table('../data/test/studentID_test.txt',sep=',',header=-1)\n",
    "test.columns = ['id']\n",
    "test['money'] = np.nan\n",
    "train_test = pd.concat([train,test])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train_test = pd.merge(train_test,score_train_test,how='left',on='id')\n",
    "train_test = pd.merge(train_test,card,how='left',on='id')\n",
    "train_test = pd.merge(train_test,time_year,how='left',on='id')\n",
    "train_test = pd.merge(train_test,card_how_amount,how='left',on='id')\n",
    "train_test = pd.merge(train_test,card_consume_amount.loc[:,[\"id\",\"换卡_count\",\"POS消费_mean\",\"卡充值_mean\",\"POS消费_sum\"\n",
    "                                                           ,\"POS消费_count\",\"圈存转账_sum\",\"POS消费_max\",\"POS消费_min\"\n",
    "                                                           ,\"圈存转账_mean\",\"卡充值_sum\" ,\"圈存转账_count\",\"卡充值_count\"\n",
    "                                                           ,\"卡补办_sum\"]],how='left',on='id')\n",
    "train_test = pd.merge(train_test,card_where_amount.loc[:,[\"id\",\"地点21_mean\",\"地点213_count\",\"地点72_sum\",\"地点283_sum\"\n",
    "                                                          ,\"地点232_mean\"]],how='left',on='id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "dropList = [\"college_12\",\"college_18\",\"其他_median\",\"其他_sum\",\"time_date\",\"time\"]\n",
    "train_test.drop(dropList,axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def singlePipeline(str1,feature):\n",
    "    card_time1_amount = singleFeatureDeal(str1,feature)\n",
    "    new_train_test = pd.merge(train_test,card_time1_amount,how='left',on='id')\n",
    "    \n",
    "    train = new_train_test.loc[train_test.money.notnull(),:]\n",
    "    test = new_train_test.loc[train_test.money.isnull(),:]\n",
    "    \n",
    "    #### Oversample\n",
    "    del test[\"money\"]\n",
    "    Oversampling1000 = train.loc[train.money == 1000]\n",
    "    Oversampling1500 = train.loc[train.money == 1500]\n",
    "    Oversampling2000 = train.loc[train.money == 2000]\n",
    "    for i in range(8):\n",
    "        train = train.append(Oversampling1000)\n",
    "    for j in range(11):\n",
    "        train = train.append(Oversampling1500)\n",
    "    for k in range(12):\n",
    "        train = train.append(Oversampling2000)\n",
    "\n",
    "    # model\n",
    "    train['isnull'] =  train.isnull().sum(axis=1)\n",
    "    test['isnull'] = test.isnull().sum(axis=1)\n",
    "    train = train.set_index([\"id\"]).fillna(-1)\n",
    "    test = test.set_index([\"id\"]).fillna(-1)\n",
    "    \n",
    "    clf = GradientBoostingClassifier(n_estimators=100,random_state=2016)\n",
    "    clf = clf.fit(train.iloc[:,1:],train.iloc[:,0])\n",
    "    predictions = clf.predict(test)\n",
    "\n",
    "    # Save results\n",
    "    pred = pd.DataFrame({'studentid': test.reset_index()['id'],\n",
    "                             'subsidy': predictions})\n",
    "    pred.subsidy = pred.subsidy.apply(lambda x:int(x))\n",
    "    print \"数据量:\", train.shape[0], \":\", pred.shape[0]\n",
    "    print \"有奖数量:\", sum(train['money'] != 0), \":\", sum(pred['subsidy'] != 0)\n",
    "    print \"1000奖学金数量：\", sum(train['money'] == 1000), sum(pred['subsidy'] == 1000)\n",
    "    print \"1500奖学金数量：\", sum(train['money'] == 1500), sum(pred['subsidy'] == 1500)\n",
    "    print \"2000奖学金数量：\", sum(train['money'] == 2000), sum(pred['subsidy'] == 2000)\n",
    "    \n",
    "    pd.DataFrame(clf.feature_importances_,index=test.columns ).sort_values(0).to_csv(\"feature_card_{}.csv\".format(str1),encoding=\"GB18030\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def doublePipeline(str1,str2,feature):\n",
    "    card_time1_amount = doubleFeatureDeal(str1,str2,feature)\n",
    "    new_train_test = pd.merge(train_test,card_time1_amount,how='left',on='id')\n",
    "    \n",
    "    train = new_train_test.loc[train_test.money.notnull(),:]\n",
    "    test = new_train_test.loc[train_test.money.isnull(),:]\n",
    "    \n",
    "    #### Oversample\n",
    "    del test[\"money\"]\n",
    "    Oversampling1000 = train.loc[train.money == 1000]\n",
    "    Oversampling1500 = train.loc[train.money == 1500]\n",
    "    Oversampling2000 = train.loc[train.money == 2000]\n",
    "    for i in range(8):\n",
    "        train = train.append(Oversampling1000)\n",
    "    for j in range(11):\n",
    "        train = train.append(Oversampling1500)\n",
    "    for k in range(12):\n",
    "        train = train.append(Oversampling2000)\n",
    "\n",
    "    # model\n",
    "    train['isnull'] =  train.isnull().sum(axis=1)\n",
    "    test['isnull'] = test.isnull().sum(axis=1)\n",
    "    train = train.set_index([\"id\"]).fillna(-1)\n",
    "    test = test.set_index([\"id\"]).fillna(-1)\n",
    "    \n",
    "    clf = GradientBoostingClassifier(n_estimators=100,random_state=2016)\n",
    "    clf = clf.fit(train.iloc[:,1:],train.iloc[:,0])\n",
    "    predictions = clf.predict(test)\n",
    "\n",
    "    # Save results\n",
    "    pred = pd.DataFrame({'studentid': test.reset_index()['id'],\n",
    "                             'subsidy': predictions})\n",
    "    pred.subsidy = pred.subsidy.apply(lambda x:int(x))\n",
    "    print \"数据量:\", train.shape[0], \":\", pred.shape[0]\n",
    "    print \"有奖数量:\", sum(train['money'] != 0), \":\", sum(pred['subsidy'] != 0)\n",
    "    print \"1000奖学金数量：\", sum(train['money'] == 1000), sum(pred['subsidy'] == 1000)\n",
    "    print \"1500奖学金数量：\", sum(train['money'] == 1500), sum(pred['subsidy'] == 1500)\n",
    "    print \"2000奖学金数量：\", sum(train['money'] == 2000), sum(pred['subsidy'] == 2000)\n",
    "    \n",
    "    pd.DataFrame(clf.feature_importances_,index=test.columns ).sort_values(0).to_csv(\"feature_card_{}_{}.csv\".format(str1,str2),encoding=\"GB18030\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "strs = [\"time_MonthofYeah\",\"time_DayofYear\",\"time_DayofMonth\",\"time_DayofWeek\",\"time_HourofDay\"]\n",
    "for str in strs:\n",
    "    singlePipeline(str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Anaconda\\lib\\site-packages\\ipykernel\\__main__.py:22: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "数据量: 26176 : 10783\n",
      "有奖数量: 16851 : 3126\n",
      "1000奖学金数量： 6669 2065\n",
      "1500奖学金数量： 5580 741\n",
      "2000奖学金数量： 4602 320\n",
      "数据量: 26176 : 10783\n",
      "有奖数量: 16851 : 3192\n",
      "1000奖学金数量： 6669 2003\n",
      "1500奖学金数量： 5580 789\n",
      "2000奖学金数量： 4602 400\n",
      "数据量: 26176 : 10783\n",
      "有奖数量: 16851 : 3121\n",
      "1000奖学金数量： 6669 2091\n",
      "1500奖学金数量： 5580 722\n",
      "2000奖学金数量： 4602 308\n",
      "数据量: 26176 : 10783\n",
      "有奖数量: 16851 : 3163\n",
      "1000奖学金数量： 6669 2053\n",
      "1500奖学金数量： 5580 752\n",
      "2000奖学金数量： 4602 358\n"
     ]
    }
   ],
   "source": [
    "strs1 = [\"time_DayofWeek\",\"time_HourofDay\"]\n",
    "strs2 = [\"how\",\"consume\"]\n",
    "for str1 in strs1:\n",
    "    for str2 in strs2:\n",
    "        doublePipeline(str1,str2,\"amount\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "card_time_amount = doubleFeatureDeal(\"time_DayofWeek\",\"how\",\"amount\")\n",
    "card_time_amount.loc[:,[\"id\"]+time_DayofWeek_how].to_csv(\"time_DayofWeek_how_amount_top.csv\",encoding=\"GB18030\",index= False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "card_time_amount = doubleFeatureDeal(\"time_DayofWeek\",\"consume\",\"amount\")\n",
    "card_time_amount.loc[:,[\"id\"]+time_DayofWeek_consume].to_csv(\"time_DayofWeek_consume_amount_top.csv\",encoding=\"GB18030\",index= False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "card_time_amount = doubleFeatureDeal(\"time_HourofDay\",\"how\",\"amount\")\n",
    "card_time_amount.loc[:,[\"id\"]+time_HourofDay_how].to_csv(\"time_DayofWeek_consume_amount_top.csv\",encoding=\"GB18030\",index= False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
